Load the data and convert it to a data frame.

# Read the raw revision log: one revision per line, with fields delimited by
# the literal token "<<sep>>" (fixed = TRUE, so it is not treated as a regex).
revision_fields <- strsplit(
  readLines("data/all_revisions_1000_articles.txt"),
  "<<sep>>",
  fixed = TRUE
)
# rbind() recycles short rows to the widest one, which would silently shift
# values into the wrong columns, so refuse ragged input before stacking.
stopifnot(length(unique(lengths(revision_fields))) <= 1L)
# Stack the per-line field vectors into a character matrix, one row per line.
revisions <- do.call(rbind, revision_fields)
head(revisions)
##      [,1]   [,2]      [,3]                          [,4]                  
## [1,] "47"   "233248"  "AbalonE"                     "2001-01-28T09:38:56Z"
## [2,] "3527" "383723"  "Military_of_Bassas_da_India" "2002-02-25T15:51:15Z"
## [3,] "6330" "243308"  "Clement_Martyn_Doke"         "2001-09-13T17:30:56Z"
## [4,] "6330" "882117"  "Clement_Martyn_Doke"         "2002-02-25T15:51:15Z"
## [5,] "6330" "4893101" "Clement_Martyn_Doke"         "2003-05-02T10:50:27Z"
## [6,] "6330" "4931724" "Clement_Martyn_Doke"         "2004-07-28T22:05:55Z"
##      [,5]                   [,6]                   [,7] [,8]
## [1,] "BryceHarrington"      "3684"                 ""   ""  
## [2,] "ip:Conversion_script" "ip:Conversion_script" ""   ""  
## [3,] "BenBaker"             "256"                  ""   ""  
## [4,] "ip:Conversion_script" "ip:Conversion_script" ""   ""  
## [5,] "JohnOwens"            "4558"                 ""   ""  
## [6,] "Evertype"             "58589"                ""   ""  
##      [,9]                                                                                                     
## [1,] ""                                                                                                       
## [2,] "Bassas_da_India"                                                                                        
## [3,] "1980 Linguist 1893 South_African"                                                                       
## [4,] "1980 Linguist 1893 Africa South_African"                                                                
## [5,] "1980 Linguist 1893 Africa South_African"                                                                
## [6,] "1980 1893 Africa South_African Click_consonant Bantu_languages Khoisan_languages Linguist Zulu_language"
##      [,10] [,11] [,12] [,13] [,14] [,15]
## [1,] ""    ""    ""    ""    ""    ""   
## [2,] ""    ""    ""    ""    ""    ""   
## [3,] ""    ""    ""    ""    ""    ""   
## [4,] ""    ""    ""    ""    ""    ""   
## [5,] ""    ""    ""    ""    ""    ""   
## [6,] ""    ""    ""    ""    ""    ""   
##      [,16]                                                                                           
## [1,] "I had scallops for dinner tonight; wonder what Abalone taste like...? Compliments of EB and PG"
## [2,] "Automated conversion"                                                                          
## [3,] "*"                                                                                             
## [4,] "Automated conversion"                                                                          
## [5,] "sentence"                                                                                      
## [6,] ""                                                                                              
##      [,17] [,18]
## [1,] "0"   "118"
## [2,] "1"   "16" 
## [3,] "1"   "8"  
## [4,] "1"   "15" 
## [5,] "1"   "16" 
## [6,] "0"   "47"
# Turn the character matrix into a data frame. type.convert() chooses a type
# for each column, and as.is = TRUE keeps text columns as character rather
# than factor. seq_len() is used so a zero-column matrix is not indexed as
# columns 1 and 0.
revisions_processed <-
  setNames(
    as.data.frame(
      lapply(seq_len(ncol(revisions)), function(i) {
        type.convert(revisions[, i], as.is = TRUE)
      }),
      stringsAsFactors = FALSE
    ),
    c(
      "article_id", "rev_id", "article_title", "timestamp", "[ip:]username",
      "user_id", "CATEGORY", "IMAGE", "MAIN", "TALK", "USER", "USER_TALK",
      "OTHER", "EXTERNAL", "TEMPLATE", "COMMENT", "MINOR", "TEXTDATA"
    )
  )
library(tidyr)
# Split the timestamp column at "T" into separate date and time columns.
revisions_processed <- separate(
  data = revisions_processed,
  col = timestamp,
  into = c("date", "time"),
  sep = "T"
)
str(revisions_processed)
## 'data.frame':    42363 obs. of  19 variables:
##  $ article_id   : int  47 3527 6330 6330 6330 6330 6330 6330 6330 6330 ...
##  $ rev_id       : int  233248 383723 243308 882117 4893101 4931724 4931731 5518069 5541683 5629426 ...
##  $ article_title: chr  "AbalonE" "Military_of_Bassas_da_India" "Clement_Martyn_Doke" "Clement_Martyn_Doke" ...
##  $ date         : chr  "2001-01-28" "2002-02-25" "2001-09-13" "2002-02-25" ...
##  $ time         : chr  "09:38:56Z" "15:51:15Z" "17:30:56Z" "15:51:15Z" ...
##  $ [ip:]username: chr  "BryceHarrington" "ip:Conversion_script" "BenBaker" "ip:Conversion_script" ...
##  $ user_id      : chr  "3684" "ip:Conversion_script" "256" "ip:Conversion_script" ...
##  $ CATEGORY     : chr  "" "" "" "" ...
##  $ IMAGE        : chr  "" "" "" "" ...
##  $ MAIN         : chr  "" "Bassas_da_India" "1980 Linguist 1893 South_African" "1980 Linguist 1893 Africa South_African" ...
##  $ TALK         : chr  "" "" "" "" ...
##  $ USER         : chr  "" "" "" "" ...
##  $ USER_TALK    : chr  "" "" "" "" ...
##  $ OTHER        : chr  "" "" "" "" ...
##  $ EXTERNAL     : chr  "" "" "" "" ...
##  $ TEMPLATE     : chr  "" "" "" "" ...
##  $ COMMENT      : chr  "I had scallops for dinner tonight; wonder what Abalone taste like...? Compliments of EB and PG" "Automated conversion" "*" "Automated conversion" ...
##  $ MINOR        : int  0 1 1 1 1 0 0 0 1 1 ...
##  $ TEXTDATA     : int  118 16 8 15 16 47 53 54 54 54 ...
# Preview the first rows of revisions_processed.
head(revisions_processed)
##   article_id  rev_id               article_title       date      time
## 1         47  233248                     AbalonE 2001-01-28 09:38:56Z
## 2       3527  383723 Military_of_Bassas_da_India 2002-02-25 15:51:15Z
## 3       6330  243308         Clement_Martyn_Doke 2001-09-13 17:30:56Z
## 4       6330  882117         Clement_Martyn_Doke 2002-02-25 15:51:15Z
## 5       6330 4893101         Clement_Martyn_Doke 2003-05-02 10:50:27Z
## 6       6330 4931724         Clement_Martyn_Doke 2004-07-28 22:05:55Z
##          [ip:]username              user_id CATEGORY IMAGE
## 1      BryceHarrington                 3684               
## 2 ip:Conversion_script ip:Conversion_script               
## 3             BenBaker                  256               
## 4 ip:Conversion_script ip:Conversion_script               
## 5            JohnOwens                 4558               
## 6             Evertype                58589               
##                                                                                                      MAIN
## 1                                                                                                        
## 2                                                                                         Bassas_da_India
## 3                                                                        1980 Linguist 1893 South_African
## 4                                                                 1980 Linguist 1893 Africa South_African
## 5                                                                 1980 Linguist 1893 Africa South_African
## 6 1980 1893 Africa South_African Click_consonant Bantu_languages Khoisan_languages Linguist Zulu_language
##   TALK USER USER_TALK OTHER EXTERNAL TEMPLATE
## 1                                            
## 2                                            
## 3                                            
## 4                                            
## 5                                            
## 6                                            
##                                                                                          COMMENT
## 1 I had scallops for dinner tonight; wonder what Abalone taste like...? Compliments of EB and PG
## 2                                                                           Automated conversion
## 3                                                                                              *
## 4                                                                           Automated conversion
## 5                                                                                       sentence
## 6                                                                                               
##   MINOR TEXTDATA
## 1     0      118
## 2     1       16
## 3     1        8
## 4     1       15
## 5     1       16
## 6     0       47

I want to grab all the categories for each article_id

# One row per article: concatenate the CATEGORY strings of all of its
# revisions into a single space-separated string.
categories <- summarise(
  group_by(revisions_processed, article_id),
  CATEGORIES = paste(CATEGORY, collapse = " ")
)
## Warning: package 'bindrcpp' was built under R version 3.4.4
# Preview the first ten articles and their concatenated category strings.
head(categories, 10) 
## # A tibble: 10 x 2
##    article_id CATEGORIES                                                  
##         <int> <chr>                                                       
##  1         47 ""                                                          
##  2       3527 ""                                                          
##  3       6330 "    Notable_South_Africans Notable_South_Africans Notable_…
##  4      10864 "             Free_software Free_software Free_software Fre…
##  5      20072 "                 Assemblers Assemblers Assemblers Assemble…
##  6      21494 "                                                          …
##  7      26582 "                                              Scottish_pol…
##  8      28056 " "                                                         
##  9      30059 "                                                          …
## 10      31025 "  "
# Reduce each article's space-separated category string to its distinct
# tokens, joined with ", ". A blank token (from the gaps left by revisions
# with no categories) is kept as an empty entry in the joined string.
# vapply() pins the result to one character value per article; sapply() would
# return a list for zero-row input.
categories$CATEGORIES <- vapply(
  strsplit(categories$CATEGORIES, split = " "),
  function(tokens) paste0(unique(trimws(tokens)), collapse = ", "),
  character(1)
)


library(tidyverse)
## ── Attaching packages ────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ stringr 1.3.0
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
# Spread each article's ", "-separated category string across columns
# category_1..category_k, padding shorter rows with "".
#
# Blank tokens are removed per article instead of dropping the first column:
# the blank only comes first when the article's concatenated string started
# with a gap, so a positional drop discards a real category for any article
# whose first token is a category name.
category_tokens <- lapply(
  strsplit(categories$CATEGORIES, ", ", fixed = TRUE),
  function(tokens) tokens[nzchar(tokens)]
)
token_counts <- lengths(category_tokens)
category_matrix <- matrix(
  "",
  nrow = length(category_tokens),
  ncol = max(token_counts, 0L)
)
# Fill row i, columns 1..token_counts[i], in one vectorised assignment using a
# two-column (row, column) index matrix.
category_matrix[cbind(
  rep(seq_along(category_tokens), token_counts),
  sequence(token_counts)
)] <- as.character(unlist(category_tokens, use.names = FALSE))
# Keep the columns as character (as is done for revisions_processed) rather
# than factors whose level sets differ from column to column.
categoriesdf <- as.data.frame(category_matrix, stringsAsFactors = FALSE)
# sprintf() over seq_len() yields zero names when there are zero columns.
names(categoriesdf) <- sprintf("category_%d", seq_len(ncol(categoriesdf)))
categoriesdf <- cbind(article_id = categories$article_id, categoriesdf)
head(categoriesdf, 10)
##    article_id             category_1             category_2
## 1          47                                              
## 2        3527                                              
## 3        6330 Notable_South_Africans   South_African_people
## 4       10864          Free_software Free_software_licenses
## 5       20072             Assemblers                       
## 6       21494            Subcultures       Sociolinguistics
## 7       26582   Scottish_politicians    History_of_Scotland
## 8       28056                                              
## 9       30059                 Turkey        Greek_mythology
## 10      31025                                              
##           category_3        category_4                     category_5
## 1                                                                    
## 2                                                                    
## 3          Linguists       1980_deaths                    1893_births
## 4  Software_licenses             Libre                               
## 5                                                                    
## 6              Slang       Stereotypes Customary_categories_of_people
## 7           Scotland Scottish_monarchs                    1329_deaths
## 8                                                                    
## 9        Archaeology    Hittite_Empire           Ancient_Greek_cities
## 10                                                                   
##                 category_6        category_7           category_8
## 1                                                                
## 2                                                                
## 3  South_African_linguists                                       
## 4                                                                
## 5                                                                
## 6                    Nerds Slang_expressions     Stock_characters
## 7              1274_births    House_of_Bruce            Defectors
## 8                                                                
## 9      Geography_of_Turkey           Trojans Archaeological_sites
## 10                                                               
##                          category_9             category_10
## 1                                                          
## 2                                                          
## 3                                                          
## 4                                                          
## 5                                                          
## 6                            People Sfasdfalang_expressions
## 7  Earls_in_the_Peerage_of_Scotland   Guardians_of_Scotland
## 8                                                          
## 9    Archaeological_sites_in_Turkey   Lost_cities_and_towns
## 10                                                         
##                      category_11           category_12    category_13
## 1                                                                    
## 2                                                                    
## 3                                                                    
## 4                                                                    
## 5                                                                    
## 6           Anti-intellectualism                                     
## 7  Wars_of_Scottish_Independence Dumfries_and_Galloway Medieval_Gaels
## 8                                                                    
## 9               Destroyed_cities Greek_sites_in_Turkey     Eccentrics
## 10                                                                   
##        category_14                category_15
## 1                                            
## 2                                            
## 3                                            
## 4                                            
## 5                                            
## 6                                            
## 7  Scottish_people          Medieval_Scotland
## 8                                            
## 9    Patent_clerks Jewish-American_scientists
## 10                                           
##                         category_16           category_17     category_18
## 1                                                                        
## 2                                                                        
## 3                                                                        
## 4                                                                        
## 5                                                                        
## 6                                                                        
## 7  Natives_of_Dumfries_and_Galloway High_Kings_of_Ireland Revolutionaries
## 8                                                                        
## 9                       1879_births         Humanitarians       Humanists
## 10                                                                       
##    category_19              category_20     category_21
## 1                                                      
## 2                                                      
## 3                                                      
## 4                                                      
## 5                                                      
## 6                                                      
## 7       Rebels Scottish_Roman_Catholics House_of_Glover
## 8                                                      
## 9  1955_deaths              Vegetarians        Refugees
## 10                                                     
##                                           category_22
## 1                                                    
## 2                                                    
## 3                                                    
## 4                                                    
## 5                                                    
## 6                                                    
## 7  People_excommunicated_by_the_Roman_Catholic_Church
## 8                                                    
## 9           Naturalized_citizens_of_the_United_States
## 10                                                   
##                          category_23     category_24
## 1                                                   
## 2                                                   
## 3                                                   
## 4                                                   
## 5                                                   
## 6                                                   
## 7  People_from_Dumfries_and_Galloway Scottish_rebels
## 8                                                   
## 9                         Socialists  Social_justice
## 10                                                  
##                           category_25                category_26
## 1                                                               
## 2                                                               
## 3                                                               
## 4                                                               
## 5                                                               
## 6                                                               
## 7                People_from_Ayrshire People_from_South_Ayrshire
## 8                                                               
## 9  Contributors_to_general_relativity                Autodidacts
## 10                                                              
##                        category_27                  category_28
## 1                                                              
## 2                                                              
## 3                                                              
## 4                                                              
## 5                                                              
## 6                                                              
## 7  Scottish_Gaelic-speaking_people                             
## 8                                                              
## 9                Jewish_scientists Natives_of_Baden-Württemberg
## 10                                                             
##          category_29  category_30       category_31      category_32
## 1                                                                   
## 2                                                                   
## 3                                                                   
## 4                                                                   
## 5                                                                   
## 6                                                                   
## 7                                                                   
## 8                                                                   
## 9  Manhattan_Project Cosmologists German_scientists German-Americans
## 10                                                                  
##          category_33    category_34 category_35               category_36
## 1                                                                        
## 2                                                                        
## 3                                                                        
## 4                                                                        
## 5                                                                        
## 6                                                                        
## 7                                                                        
## 8                                                                        
## 9  World_federalists Erdős_number_2  Physicists Formerly_stateless_people
## 10                                                                       
##                       category_37       category_38     category_39
## 1                                                                  
## 2                                                                  
## 3                                                                  
## 4                                                                  
## 5                                                                  
## 6                                                                  
## 7                                                                  
## 8                                                                  
## 9  Nobel_Prize_in_Physics_winners German_physicists Albert_Einstein
## 10                                                                 
##            category_40                   category_41
## 1                                                   
## 2                                                   
## 3                                                   
## 4                                                   
## 5                                                   
## 6                                                   
## 7                                                   
## 8                                                   
## 9  Aegean_civilization Ancient_Greek_sites_in_Turkey
## 10                                                  
##                       category_42                  category_43
## 1                                                             
## 2                                                             
## 3                                                             
## 4                                                             
## 5                                                             
## 6                                                             
## 7                                                             
## 8                                                             
## 9  World_Heritage_Sites_in_Turkey Locations_in_Greek_mythology
## 10                                                            
##                 category_44
## 1                          
## 2                          
## 3                          
## 4                          
## 5                          
## 6                          
## 7                          
## 8                          
## 9  National_parks_of_Turkey
## 10
# Inspect the dimensions and column types of categoriesdf.
str(categoriesdf)
## 'data.frame':    1015 obs. of  45 variables:
##  $ article_id : int  47 3527 6330 10864 20072 21494 26582 28056 30059 31025 ...
##  $ category_1 : Factor w/ 632 levels "","_1934_births",..: 1 1 452 301 124 547 516 1 588 1 ...
##  $ category_2 : Factor w/ 501 levels "","_1963_deaths",..: 1 1 424 244 1 420 277 1 264 1 ...
##  $ category_3 : Factor w/ 372 levels "","_1870_births",..: 1 1 231 321 1 319 309 1 87 1 ...
##  $ category_4 : Factor w/ 288 levels "","_1920_Year_of_birth",..: 1 1 37 187 1 263 249 1 168 1 ...
##  $ category_5 : Factor w/ 205 levels "","_Cancelled_PC_games",..: 1 1 17 1 1 85 11 1 56 1 ...
##  $ category_6 : Factor w/ 167 levels "","_Computer_and_video_games_based_on_licensed_properties",..: 1 1 149 1 1 120 10 1 88 1 ...
##  $ category_7 : Factor w/ 129 levels "","_Animated_Television_Series",..: 1 1 1 1 1 113 71 1 121 1 ...
##  $ category_8 : Factor w/ 98 levels "","_1897_births",..: 1 1 1 1 1 94 40 1 24 1 ...
##  $ category_9 : Factor w/ 77 levels "","_Irish-Americans",..: 1 1 1 1 1 52 22 1 9 1 ...
##  $ category_10: Factor w/ 66 levels "","_6teen_Characters",..: 1 1 1 1 1 59 33 1 38 1 ...
##  $ category_11: Factor w/ 50 levels "","1956_births",..: 1 1 1 1 1 14 49 1 21 1 ...
##  $ category_12: Factor w/ 41 levels "","1920_births",..: 1 1 1 1 1 1 16 1 21 1 ...
##  $ category_13: Factor w/ 36 levels "","6teen_characters",..: 1 1 1 1 1 1 19 1 14 1 ...
##  $ category_14: Factor w/ 29 levels "","_PlayStation_Portable_games",..: 1 1 1 1 1 1 21 1 18 1 ...
##  $ category_15: Factor w/ 24 levels "","1976_deaths",..: 1 1 1 1 1 1 15 1 12 1 ...
##  $ category_16: Factor w/ 21 levels "","1879_births",..: 1 1 1 1 1 1 12 1 2 1 ...
##  $ category_17: Factor w/ 18 levels "","American_baseball_players",..: 1 1 1 1 1 1 11 1 12 1 ...
##  $ category_18: Factor w/ 17 levels "","Albany_Capitals_players",..: 1 1 1 1 1 1 17 1 9 1 ...
##  $ category_19: Factor w/ 13 levels "","1955_deaths",..: 1 1 1 1 1 1 11 1 2 1 ...
##  $ category_20: Factor w/ 12 levels "","Miami_Vice_cast_members",..: 1 1 1 1 1 1 7 1 12 1 ...
##  $ category_21: Factor w/ 9 levels "","24_(TV_series)_cast_members",..: 1 1 1 1 1 1 6 1 9 1 ...
##  $ category_22: Factor w/ 8 levels "","African_American_sportspeople",..: 1 1 1 1 1 1 8 1 6 1 ...
##  $ category_23: Factor w/ 7 levels "","Christianity_in_Oxford",..: 1 1 1 1 1 1 5 1 6 1 ...
##  $ category_24: Factor w/ 5 levels "","Fame_(TV_series)_cast_members",..: 1 1 1 1 1 1 3 1 4 1 ...
##  $ category_25: Factor w/ 4 levels "","Contributors_to_general_relativity",..: 1 1 1 1 1 1 4 1 2 1 ...
##  $ category_26: Factor w/ 4 levels "","Autodidacts",..: 1 1 1 1 1 1 4 1 2 1 ...
##  $ category_27: Factor w/ 3 levels "","Jewish_scientists",..: 1 1 1 1 1 1 3 1 2 1 ...
##  $ category_28: Factor w/ 2 levels "","Natives_of_Baden-Württemberg": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_29: Factor w/ 2 levels "","Manhattan_Project": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_30: Factor w/ 2 levels "","Cosmologists": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_31: Factor w/ 2 levels "","German_scientists": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_32: Factor w/ 2 levels "","German-Americans": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_33: Factor w/ 2 levels "","World_federalists": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_34: Factor w/ 2 levels "","Erdős_number_2": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_35: Factor w/ 2 levels "","Physicists": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_36: Factor w/ 2 levels "","Formerly_stateless_people": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_37: Factor w/ 2 levels "","Nobel_Prize_in_Physics_winners": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_38: Factor w/ 2 levels "","German_physicists": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_39: Factor w/ 2 levels "","Albert_Einstein": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_40: Factor w/ 2 levels "","Aegean_civilization": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_41: Factor w/ 2 levels "","Ancient_Greek_sites_in_Turkey": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_42: Factor w/ 2 levels "","World_Heritage_Sites_in_Turkey": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_43: Factor w/ 2 levels "","Locations_in_Greek_mythology": 1 1 1 1 1 1 1 1 2 1 ...
##  $ category_44: Factor w/ 2 levels "","National_parks_of_Turkey": 1 1 1 1 1 1 1 1 2 1 ...

Now to tidy it up: reshape from wide to long format, replace empty strings with NA, and filter them out.

library(dplyr)
# Stack the category_* columns into a single CATEGORIES column, discard the
# blank padding cells, then count the rows per category, most frequent first.
df3 <- categoriesdf %>%
  gather(category, CATEGORIES, -article_id) %>%
  select(-category) %>%
  filter(!is.na(CATEGORIES), CATEGORIES != "") %>%
  group_by(CATEGORIES) %>%
  summarise(number = n()) %>%
  arrange(desc(number))
## Warning: attributes are not identical across measure variables;
## they will be dropped
# Show the ten most frequent categories.
head(df3, 10)
## # A tibble: 10 x 2
##    CATEGORIES                            number
##    <chr>                                  <int>
##  1 Living_people                             84
##  2 Year_of_birth_missing                     12
##  3 Year_of_birth_missing_(living_people)     10
##  4 Articles_for_deletion                      6
##  5 2004_albums                                5
##  6 Debut_albums                               5
##  7 1958_births                                4
##  8 1977_births                                4
##  9 2007_albums                                4
## 10 American_film_actors                       4
# Inspect the dimensions and column types of the category counts.
str(df3)
## Classes 'tbl_df', 'tbl' and 'data.frame':    2662 obs. of  2 variables:
##  $ CATEGORIES: chr  "Living_people" "Year_of_birth_missing" "Year_of_birth_missing_(living_people)" "Articles_for_deletion" ...
##  $ number    : int  84 12 10 6 5 5 4 4 4 4 ...

Date work

Separate the date, create monthly counts

# Break the date into year / month / day columns and count the revisions of
# each article in each calendar month.
date_split <- separate(
  data = revisions_processed,
  col = date,
  into = c("year", "month", "day"),
  sep = "-"
)
monthlycounts <- summarise(
  group_by(date_split, article_id, year, month),
  count = n()
)
# Display the counts in chronological order within each article.
arrange(monthlycounts, article_id, year, month)
# Work on a copy whose date column is parsed into a Date instead of text.
revisions_to_dates <- revisions_processed
revisions_to_dates[["date"]] <- as.Date(
  revisions_to_dates[["date"]],
  format = "%Y-%m-%d"
)
str(revisions_to_dates)
## 'data.frame':    42363 obs. of  19 variables:
##  $ article_id   : int  47 3527 6330 6330 6330 6330 6330 6330 6330 6330 ...
##  $ rev_id       : int  233248 383723 243308 882117 4893101 4931724 4931731 5518069 5541683 5629426 ...
##  $ article_title: chr  "AbalonE" "Military_of_Bassas_da_India" "Clement_Martyn_Doke" "Clement_Martyn_Doke" ...
##  $ date         : Date, format: "2001-01-28" "2002-02-25" ...
##  $ time         : chr  "09:38:56Z" "15:51:15Z" "17:30:56Z" "15:51:15Z" ...
##  $ [ip:]username: chr  "BryceHarrington" "ip:Conversion_script" "BenBaker" "ip:Conversion_script" ...
##  $ user_id      : chr  "3684" "ip:Conversion_script" "256" "ip:Conversion_script" ...
##  $ CATEGORY     : chr  "" "" "" "" ...
##  $ IMAGE        : chr  "" "" "" "" ...
##  $ MAIN         : chr  "" "Bassas_da_India" "1980 Linguist 1893 South_African" "1980 Linguist 1893 Africa South_African" ...
##  $ TALK         : chr  "" "" "" "" ...
##  $ USER         : chr  "" "" "" "" ...
##  $ USER_TALK    : chr  "" "" "" "" ...
##  $ OTHER        : chr  "" "" "" "" ...
##  $ EXTERNAL     : chr  "" "" "" "" ...
##  $ TEMPLATE     : chr  "" "" "" "" ...
##  $ COMMENT      : chr  "I had scallops for dinner tonight; wonder what Abalone taste like...? Compliments of EB and PG" "Automated conversion" "*" "Automated conversion" ...
##  $ MINOR        : int  0 1 1 1 1 0 0 0 1 1 ...
##  $ TEXTDATA     : int  118 16 8 15 16 47 53 54 54 54 ...
# Reformat each date as "%Y-%m" (year and month only) so revisions can be
# grouped by month.
revisions_to_dates[["date"]] <- format(
  revisions_to_dates[["date"]],
  format = "%Y-%m"
)

head(revisions_to_dates)
##   article_id  rev_id               article_title    date      time
## 1         47  233248                     AbalonE 2001-01 09:38:56Z
## 2       3527  383723 Military_of_Bassas_da_India 2002-02 15:51:15Z
## 3       6330  243308         Clement_Martyn_Doke 2001-09 17:30:56Z
## 4       6330  882117         Clement_Martyn_Doke 2002-02 15:51:15Z
## 5       6330 4893101         Clement_Martyn_Doke 2003-05 10:50:27Z
## 6       6330 4931724         Clement_Martyn_Doke 2004-07 22:05:55Z
##          [ip:]username              user_id CATEGORY IMAGE
## 1      BryceHarrington                 3684               
## 2 ip:Conversion_script ip:Conversion_script               
## 3             BenBaker                  256               
## 4 ip:Conversion_script ip:Conversion_script               
## 5            JohnOwens                 4558               
## 6             Evertype                58589               
##                                                                                                      MAIN
## 1                                                                                                        
## 2                                                                                         Bassas_da_India
## 3                                                                        1980 Linguist 1893 South_African
## 4                                                                 1980 Linguist 1893 Africa South_African
## 5                                                                 1980 Linguist 1893 Africa South_African
## 6 1980 1893 Africa South_African Click_consonant Bantu_languages Khoisan_languages Linguist Zulu_language
##   TALK USER USER_TALK OTHER EXTERNAL TEMPLATE
## 1                                            
## 2                                            
## 3                                            
## 4                                            
## 5                                            
## 6                                            
##                                                                                          COMMENT
## 1 I had scallops for dinner tonight; wonder what Abalone taste like...? Compliments of EB and PG
## 2                                                                           Automated conversion
## 3                                                                                              *
## 4                                                                           Automated conversion
## 5                                                                                       sentence
## 6                                                                                               
##   MINOR TEXTDATA
## 1     0      118
## 2     1       16
## 3     1        8
## 4     1       15
## 5     1       16
## 6     0       47
# Count the revisions of each article in each month: one row per
# (article_id, date) pair with the number of revisions in `count`.
collapsed <- summarise(
  group_by(revisions_to_dates, article_id, date),
  count = n()
)
# Show the busiest article-months first.
arrange(collapsed, desc(count))
## # A tibble: 8,856 x 3
## # Groups:   article_id [1,015]
##    article_id date    count
##         <int> <chr>   <int>
##  1    8312072 2006-12   297
##  2      21494 2006-11   249
##  3      21494 2007-01   241
##  4      21494 2007-05   232
##  5      21494 2007-04   206
##  6     275510 2007-03   200
##  7      21494 2006-10   199
##  8      21494 2007-02   197
##  9      21494 2007-09   197
## 10    8918937 2007-07   185
## # ... with 8,846 more rows
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# USE THIS
# Monthly revision counts, one line per article, coloured by article id.
# ggplot2 has no built-in scale for zoo's yearmon class and would otherwise
# fall back to a plain continuous axis with a message; scale_x_yearmon()
# (from zoo) gives the x axis proper year-month labels.
ggplot(data = collapsed,
       aes(x = as.yearmon(date),
           y = count,
           colour = article_id)) +
  geom_line(aes(group = article_id)) +
  geom_point(size = 1.3) +
  scale_x_yearmon(format = "%Y-%m") +
  scale_color_gradient2(midpoint = 7000000, labels = comma) +
  labs(title = "Revisions by Month for 1000 Sampled Article IDs (2001-2008)", colour = "Article ID") +
  xlab("Year-Month") +
  ylab("Number of Revisions")
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

# +
  # theme(plot.title = element_text(size = rel(1.3)), axis.ticks.length = unit(.25, "cm"))

# Save the most recently displayed plot (width 10, height 6).
ggsave(
  filename = 'img/article-revisions-by-month-sample1.png',
  plot = last_plot(),
  width = 10, height = 6
)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

It could be interesting to look at which articles have such high amounts of revisions.

For now, let’s normalize by total number of revisions to get percentages.

# Express each monthly count as a fraction of that article's total number of
# revisions (the fractions of one article sum to 1).
percents <- mutate(
  group_by(collapsed, article_id),
  percent = count / sum(count)
)

library(zoo)
# USE THIS
# Share of each article's revisions that falls in each month, one line per
# article, coloured by article id.
# The point layer is added once (it used to be added twice, drawing every
# point twice), and the y axis is labelled as a percentage because `percent`
# holds fractions in [0, 1].
ggplot(data = percents,
       aes(x = as.yearmon(date),
           y = percent,
           colour = article_id)) +
  geom_line(aes(group = article_id)) +
  geom_point(size = 1.3) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_gradient2(midpoint = 7000000, labels = comma) +
  labs(title = "Revisions Per Month Normalized by Total Revisions for 1000 Sampled Article IDs (2001-2008)", colour = "Article ID") +
  xlab("Year-Month") +
  ylab("Percent of Article Revisions")
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

# Save the most recently displayed plot (width 10, height 6).
ggsave(
  filename = 'img/article-revisions-normalized-by-month-sample1.png',
  plot = last_plot(),
  width = 10, height = 6
)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

Plot this together with the first revision date to support our point about article ids and article age. It looks like articles that were created after 2006 might not have had time to show the true trends of the article lifecycle. To give every article at least five years of history, let’s only look at articles that were first edited before 2003.

# Month of the first revision for each article id.
# `date` is a zero-padded "YYYY-MM" string, so min() on it is chronological.
# Taking the minimum explicitly (instead of keeping the first row seen per
# article) gives the right answer even if the rows are not sorted by date.
# `percents` has one row per (article_id, date), so exactly one row per
# article survives.

# USE THIS - prepare background slide for why we cut off at 2003
t.first <- percents %>%
  group_by(article_id) %>%
  filter(date == min(date))
arrange(t.first, desc(date))
# Articles whose first revision month is earlier than January 2003.
t.first.before2003 <- subset(t.first, as.yearmon(date) < as.yearmon("2003-01"))
arrange(t.first.before2003, desc(date))

So now we know which article_ids were created before 2003. We need to subset the data to these article_ids.

# Keep only the monthly rows of articles first revised before 2003-01.
filtered_revisions <- percents[
  is.element(percents$article_id, t.first.before2003$article_id),
]
arrange(filtered_revisions, article_id)

# Monthly share of revisions for the retained articles; the red vertical line
# marks the 2003-01 cut-off.
ggplot(
  data = filtered_revisions,
  mapping = aes(x = as.yearmon(date), y = percent, colour = article_id)
) +
  geom_line(aes(group = article_id)) +
  geom_vline(xintercept = as.yearmon("2003-01"), colour = "red") +
  geom_point()
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

# Save the most recently displayed plot (width 10, height 6).
ggsave(
  filename = 'img-other/articles-normalized-by-total-subsample1.png',
  plot = last_plot(),
  width = 10, height = 6
)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
# Same plot zoomed in to the 0-50% range.
# coord_cartesian() only changes the visible window. ylim() instead replaced
# every value above 0.5 with a missing value (hence the "Removed 5 rows"
# warning), which dropped those points and interrupted the affected lines.
ggplot(data = filtered_revisions, aes(x = as.yearmon(date), y = percent, colour = article_id)) +
  geom_line(aes(group = article_id)) +
  geom_vline(xintercept = as.yearmon("2003-01"), colour = "red") +
  coord_cartesian(ylim = c(0, .5)) +
  geom_point()
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Warning: Removed 5 rows containing missing values (geom_point).

# Save the most recently displayed plot (width 10, height 6).
ggsave(
  filename = 'img-other/articles-normalized-by-total-truncated-subsample1.png',
  plot = last_plot(),
  width = 10, height = 6
)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Warning: Removed 5 rows containing missing values (geom_point).

Convert records to time since first revision

# Re-express every monthly record as "months since the article's first
# revision" so that articles of different ages can be overlaid.
aligned_revisions <- filtered_revisions
# Look up each row's first-revision month from t.first.
aligned_revisions$first <- t.first$date[
  match(aligned_revisions$article_id, t.first$article_id)
]
# The yearmon difference is multiplied by 12 to get months, then rounded to a
# whole number.
aligned_revisions$time.since.creation <- as.integer(round(
  (as.yearmon(aligned_revisions$date) - as.yearmon(aligned_revisions$first)) * 12
))
library(scales)
# Share of revisions against article age (time.since.creation / 12 on the
# x axis), one line per article.
ggplot(
  data = aligned_revisions,
  mapping = aes(x = time.since.creation / 12, y = percent, colour = article_id)
) +
  geom_line(aes(group = article_id)) +
  scale_y_continuous(labels = scales::percent) +
  geom_point()

# Save the plot that was just displayed (width 10, height 6).
ggsave(
  filename = 'img-other/articles-time-since-first-creation-subsample1.png',
  plot = last_plot(),
  width = 10, height = 6
)

Okay so there might be some trends that are obfuscated because we do not have 0% for months where there are no edits.

library(data.table)
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
# Reshape to wide form: one row per article and one column per value of
# time.since.creation, holding that month's share of the article's revisions.
# Note that setDT() converts aligned_revisions to a data.table in place.
all_months <- dcast(
  setDT(aligned_revisions),
  article_id ~ time.since.creation,
  value.var = 'percent'
)
all_months <- as.data.frame(all_months)
# Article/month combinations with no revisions come out as NA; store them as 0.
all_months[is.na(all_months)] <- 0
head(all_months)
##   article_id            0 1           2 3 4            5 6 7           8 9
## 1         47 1.0000000000 0 0.000000000 0 0 0.0000000000 0 0 0.000000000 0
## 2       3527 1.0000000000 0 0.000000000 0 0 0.0000000000 0 0 0.000000000 0
## 3       6330 0.0285714286 0 0.000000000 0 0 0.0285714286 0 0 0.000000000 0
## 4      10864 0.0035971223 0 0.003597122 0 0 0.0000000000 0 0 0.003597122 0
## 5      20072 0.0434782609 0 0.000000000 0 0 0.0000000000 0 0 0.000000000 0
## 6      21494 0.0002515723 0 0.000000000 0 0 0.0002515723 0 0 0.000000000 0
##   10 11           12           13          14 15 16           17
## 1  0  0 0.0000000000 0.0000000000 0.000000000  0  0 0.0000000000
## 2  0  0 0.0000000000 0.0000000000 0.000000000  0  0 0.0000000000
## 3  0  0 0.0000000000 0.0000000000 0.000000000  0  0 0.0000000000
## 4  0  0 0.0035971223 0.0000000000 0.000000000  0  0 0.0000000000
## 5  0  0 0.0000000000 0.0000000000 0.000000000  0  0 0.0434782609
## 6  0  0 0.0002515723 0.0002515723 0.000754717  0  0 0.0002515723
##             18          19         20 21           22           23
## 1 0.0000000000 0.000000000 0.00000000  0 0.0000000000 0.0000000000
## 2 0.0000000000 0.000000000 0.00000000  0 0.0000000000 0.0000000000
## 3 0.0000000000 0.000000000 0.02857143  0 0.0000000000 0.0000000000
## 4 0.0000000000 0.000000000 0.00000000  0 0.0000000000 0.0035971223
## 5 0.0000000000 0.000000000 0.00000000  0 0.0000000000 0.0000000000
## 6 0.0005031447 0.000754717 0.00000000  0 0.0002515723 0.0005031447
##            24 25 26 27          28           29          30          31
## 1 0.000000000  0  0  0 0.000000000 0.0000000000 0.000000000 0.000000000
## 2 0.000000000  0  0  0 0.000000000 0.0000000000 0.000000000 0.000000000
## 3 0.000000000  0  0  0 0.000000000 0.0000000000 0.000000000 0.000000000
## 4 0.007194245  0  0  0 0.000000000 0.0000000000 0.000000000 0.000000000
## 5 0.000000000  0  0  0 0.000000000 0.0000000000 0.000000000 0.043478261
## 6 0.000000000  0  0  0 0.001257862 0.0002515723 0.002012579 0.001509434
##            32          33          34         35          36          37
## 1 0.000000000 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
## 2 0.000000000 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
## 3 0.000000000 0.000000000 0.085714286 0.05714286 0.057142857 0.000000000
## 4 0.007194245 0.000000000 0.000000000 0.00000000 0.000000000 0.000000000
## 5 0.000000000 0.000000000 0.043478261 0.00000000 0.000000000 0.043478261
## 6 0.001761006 0.008805031 0.004528302 0.00327044 0.005534591 0.005031447
##            38          39          40          41          42          43
## 1 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 2 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 3 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 4 0.000000000 0.000000000 0.000000000 0.003597122 0.010791367 0.010791367
## 5 0.000000000 0.043478261 0.043478261 0.000000000 0.000000000 0.043478261
## 6 0.008301887 0.005534591 0.003773585 0.002515723 0.007044025 0.007295597
##           44         45         46         47         48         49
## 1 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 2 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 3 0.00000000 0.00000000 0.14285714 0.05714286 0.02857143 0.05714286
## 4 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 5 0.00000000 0.00000000 0.00000000 0.08695652 0.00000000 0.00000000
## 6 0.00754717 0.01056604 0.01031447 0.01509434 0.01283019 0.01056604
##            50         51         52         53         54         55
## 1 0.000000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 2 0.000000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 3 0.028571429 0.00000000 0.00000000 0.00000000 0.00000000 0.02857143
## 4 0.003597122 0.00000000 0.04676259 0.00000000 0.01438849 0.03956835
## 5 0.000000000 0.00000000 0.00000000 0.00000000 0.00000000 0.04347826
## 6 0.011069182 0.02389937 0.02062893 0.01786164 0.02314465 0.01786164
##           56         57         58         59         60         61
## 1 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 2 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 3 0.02857143 0.00000000 0.02857143 0.00000000 0.02857143 0.02857143
## 4 0.05755396 0.03597122 0.01079137 0.01079137 0.03237410 0.01438849
## 5 0.00000000 0.08695652 0.00000000 0.00000000 0.08695652 0.00000000
## 6 0.02616352 0.03018868 0.01635220 0.01911950 0.02314465 0.05006289
##            62         63         64         65         66         67
## 1 0.000000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 2 0.000000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 3 0.000000000 0.02857143 0.02857143 0.00000000 0.00000000 0.00000000
## 4 0.007194245 0.01079137 0.03597122 0.00000000 0.06115108 0.05755396
## 5 0.130434783 0.13043478 0.04347826 0.00000000 0.00000000 0.00000000
## 6 0.062641509 0.02490566 0.06062893 0.04955975 0.04201258 0.05182390
##           68         69         70         71         72         73
## 1 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 2 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 3 0.00000000 0.02857143 0.08571429 0.00000000 0.05714286 0.00000000
## 4 0.05755396 0.01798561 0.07913669 0.10431655 0.03597122 0.04316547
## 5 0.00000000 0.00000000 0.04347826 0.00000000 0.00000000 0.00000000
## 6 0.05836478 0.04477987 0.02792453 0.02540881 0.04955975 0.02817610
##           74          75          76         77         78 79 80
## 1 0.00000000 0.000000000 0.000000000 0.00000000 0.00000000  0  0
## 2 0.00000000 0.000000000 0.000000000 0.00000000 0.00000000  0  0
## 3 0.02857143 0.000000000 0.000000000 0.00000000 0.00000000  0  0
## 4 0.03956835 0.032374101 0.014388489 0.02517986 0.05395683  0  0
## 5 0.00000000 0.000000000 0.000000000 0.00000000 0.00000000  0  0
## 6 0.04477987 0.005786164 0.004779874 0.00000000 0.00000000  0  0
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggvis)
## 
## Attaching package: 'ggvis'
## The following object is masked from 'package:ggplot2':
## 
##     resolution
## The following objects are masked from 'package:scales':
## 
##     fullseq, zero_range
# Back to long form: one row per (article_id, month offset).
df1 <- melt(all_months, "article_id")
# `variable` holds the former column names ("0", "1", ...) as a factor.
# as.integer() on a factor returns the level codes (1, 2, ...), which would
# shift every month by one, so go through as.character() to recover the
# actual month offsets.
df1$variable <- as.integer(as.character(df1$variable))

# Share of revisions per month offset (`variable`), one line per article,
# now including the zero-filled months.
ggplot(
  data = df1,
  mapping = aes(x = variable, y = value, colour = article_id)
) +
  geom_line(aes(group = article_id)) +
  scale_y_continuous(labels = scales::percent) +
  geom_point()

# Save the plot that was just displayed (width 10, height 6).
ggsave(
  filename = 'img-other/articles-normalized-by-total-melted-subsample1.png',
  plot = last_plot(),
  width = 10, height = 6
)

Okay, so we could use a plot to show why we are throwing out articles whose first revision was after some date. We also need to recalculate the percentages over a fixed window: the percent of revisions that occur in the first five years, or the first two years?

# Raw monthly revision counts for every sampled article, coloured by
# article id.
ggplot(
  data = collapsed,
  mapping = aes(x = as.yearmon(date), y = count, colour = article_id)
) +
  geom_line(aes(group = article_id)) +
  geom_point(size = 1.3) +
  scale_color_gradient2(midpoint = 7000000)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

# Save the most recently displayed plot (width 10, height 6).
ggsave(
  filename = 'img-other/why-throwout-articles-after-first-revision-subsample1.png',
  plot = last_plot(),
  width = 10, height = 6
)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

Okay, so we need to remove articles whose first revision came less than 5 years before the last date in the data (2008-01), working on the data with the original counts rather than the percentages.

# Same article filter as before (first revision before 2003-01), applied to
# the raw monthly counts instead of the percentages.
filtered_revisions_counts <- collapsed[
  is.element(collapsed$article_id, t.first.before2003$article_id),
]
arrange(filtered_revisions_counts, article_id)

# Monthly counts for the retained articles; the red vertical line marks the
# 2003-01 cut-off.
ggplot(
  data = filtered_revisions_counts,
  mapping = aes(x = as.yearmon(date), y = count, colour = article_id)
) +
  geom_line(aes(group = article_id)) +
  geom_vline(xintercept = as.yearmon("2003-01"), colour = "red") +
  scale_color_gradient2(midpoint = 7000000)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

# Save the most recently displayed plot (width 10, height 6).
ggsave(
  filename = 'img-other/filtered-revision-counts-subsample1.png',
  plot = last_plot(),
  width = 10, height = 6
)
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

Okay. Now we need to convert the date to months since creation (so we can cap the data at the first 5 years), i.e. convert the records to time since first revision.

# Attach each article's first-revision month to its monthly count rows and
# derive the number of months elapsed since that first revision.
aligned_revisions_counts <- filtered_revisions_counts
aligned_revisions_counts$first <- t.first$date[
  match(aligned_revisions_counts$article_id, t.first$article_id)
]
# The yearmon difference is multiplied by 12 to get months, then rounded to a
# whole number.
aligned_revisions_counts$time.since.creation <- round(
  (as.yearmon(aligned_revisions_counts$date) -
     as.yearmon(aligned_revisions_counts$first)) * 12
)

library(scales)
# Monthly revision counts against article age (time.since.creation / 12 on
# the x axis), one line per article.
ggplot(
  data = aligned_revisions_counts,
  mapping = aes(x = time.since.creation / 12, y = count, colour = article_id)
) +
  geom_line(aes(group = article_id)) +
  geom_point() +
  xlab("Years since First Revision")

# Save the plot that was just displayed (width 10, height 6).
ggsave(
  filename = 'img-other/articles-months-since-first-revision-subsample1.png',
  plot = last_plot(),
  width = 10, height = 6
)

This is really interesting that smaller article_ids (older articles?) were heavily edited around 4-6 years after first creation. Okay now we need to convert to wide format.

library(data.table)
# Reshape the counts to wide form: one row per article and one column per
# value of time.since.creation.
# dcast is namespace-qualified because reshape2 was attached after data.table
# and masks dcast(); calling library(data.table) again does not change that.
# Note that setDT() converts aligned_revisions_counts to a data.table in place.
all_months_counts <- as.data.frame(
  data.table::dcast(
    setDT(aligned_revisions_counts),
    article_id ~ time.since.creation,
    value.var = 'count'
  )
)
# Article/month combinations with no revisions come out as NA; store them as 0.
all_months_counts[is.na(all_months_counts)] <- 0
all_months_counts
##    article_id  0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
## 1          47  1 0 0 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 2        3527  1 0 0 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 3        6330  1 0 0 0 0 1 0 0 0 0  0  0  0  0  0  0  0  0  0  0  1  0  0
## 4       10864  1 0 1 0 0 0 0 0 1 0  0  0  1  0  0  0  0  0  0  0  0  0  0
## 5       20072  1 0 0 0 0 0 0 0 0 0  0  0  0  0  0  0  0  1  0  0  0  0  0
## 6       21494  1 0 0 0 0 1 0 0 0 0  0  0  1  1  3  0  0  1  2  3  0  0  1
## 7       26582  3 1 0 1 0 0 0 1 1 1  1  2  0  0  1  0  0  0  1  3  1  1  3
## 8       28056  2 0 0 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 9       30059  1 0 0 0 0 0 2 0 0 0  0  3  3  4  1  0  3  1  0  0  1  2  0
## 10      31025  3 0 0 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 11      34422  1 0 0 0 0 0 0 0 0 3  0  0  0  0  1  0  0  1  5  3  1  0  1
## 12      34615  2 0 1 0 0 0 2 1 0 0  5  0  0  0  4  3  1  4  0  3  2  0  1
## 13      40908  1 0 0 1 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 14      54107  1 0 0 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 15      57122  4 0 3 0 0 0 0 0 1 0  0  1  0  0  1  0  0  0  0  0  2  4  0
## 16      78055  1 1 1 0 0 0 0 0 1 0  0  0  1  0  0  0  0  0  0  0  4  0  0
## 17      78932  1 0 0 0 0 0 0 0 0 0  0  0  0  0  0  0  1  0  0  1  0  0  1
## 18      84081  1 0 0 2 0 1 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  2  2
## 19      89237  2 0 0 0 0 0 1 0 0 1  0  0  0  0  0  1  0  0  0  0  0  0  6
## 20      98197  1 0 0 0 0 0 0 0 0 0  0  0  0  0  0  0  0  1  0  0  0  0  0
## 21     101352  2 0 0 0 0 0 1 0 0 0  0  2  0  0  0  0  1  0  0  0  0  1  2
## 22     110683  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  1
## 23     110813  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  1
## 24     121543  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  1
## 25     123119  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  1  0  1
## 26     125806  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  1
## 27     127226  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  1
## 28     127246  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  1  0  0  0  0  1
## 29     127922  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  1  0  0  0  0  1
## 30     132879  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  1
## 31     133993  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  1
## 32     135427  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  1  0  1
## 33     136291  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  2  2
## 34     137704  2 0 0 0 2 0 0 0 0 0  2  1  0  0  0  0  0  0  0  0  2  0  0
## 35     139330  1 0 1 0 0 0 0 0 0 0  0  0  0  0  0  0  0  0  0  0  0  0  2
## 36     151701  1 0 0 0 0 0 0 0 0 0  0  0  1  0  0  0  5  0  0  2  2  0  1
## 37     152509 12 0 0 0 0 0 0 0 5 0  0  4  0  0  0  0  0  1  1  1  0  2 12
## 38     155081  1 0 0 0 0 0 0 1 0 0  0  0  0  0  1  0  2  3  0  0  0  4  1
## 39     158625  2 0 0 0 0 0 0 1 0 0  0  1  0  0  2  0  0  0  1  0  0  0  1
## 40     161973  1 0 0 1 2 0 0 0 0 0  1  0  0  0  0  0  0  1  0  0  0  1  3
##    23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## 1   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 2   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 3   0  0  0  0  0  0  0  0  0  0  0  3  2  2  0  0  0  0  0  0  0  0  0  5
## 4   1  2  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0  1  3  3  0  0  0
## 5   0  0  0  0  0  0  0  0  1  0  0  1  0  0  1  0  1  1  0  0  1  0  0  0
## 6   2  0  0  0  0  5  1  8  6  7 35 18 13 22 20 33 22 15 10 28 29 30 42 41
## 7  10  3  0  5  0  2  4  3  2 11 11  2  0  6  7 10 10 11 10 11  8  7 13 26
## 8   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 9   0  3  1 11  1  2  0  0  8  2 20  4  5 12  2 33 20 23  2  7 10  6 27 16
## 10  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 11  0  0  2  1  1  0  2  1  0  2  1  0  7  6  3  2  3  2  3  1  1  3  4  4
## 12  1  1  2  2  6  2  3  5  3  4  6  5  2 15 11 13 13 17 38  8  7 11  3 34
## 13  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  2
## 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 15  1  5  2  0  4  1  0  1  1  0  4  2  1  0  0  4  1  2  0  9 31  8  3  2
## 16  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 17  0  0  1  1  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  2
## 18  0  0  1  0  0  1  0  1  3  1  0  1  2  1  1  0  3  0  3  0  0  1  0  0
## 19  0  3  0  1  0  0  0  0  3  2  3  0  0  0  1  2  0  5  2  0  0  0  0  0
## 20  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
## 21  0  1  1  2  2  2  0  2  0  0  2  0  1  1  0  1  0  1 11  3  0  1  1  5
## 22  0  1  4  1  0  0  0  0  0  0  0  0  1  0  0  1  0  2  1  1  1  0  0  0
## 23  0  0  2  1  0  0  0  0  0  0  0  0  0  0  1  0  0  0  1  1  1  0  0  0
## 24  0  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  1  0  0  0
## 25  0  0  3  1  0  0  0  0  0  0  0  0  0  0  1  0  0  0  1  1  1  0  0  0
## 26  0  1  1  0  0  0  0  0  0  1  0  0  0  0  0  3  0  0  1  0  1  0  1  0
## 27  0  1  1  2  0  0  3  0  0  0  0  0  1  0  1  0  0  0  1 16  1  0  1  0
## 28  0  0  3  1  0  0  0  0  0  0  1  0  0  0  0  0  0  1  3  1  1  4  0  1
## 29  0  0  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  2  1  0  0  0
## 30  0  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  1  0  1  0  0  0
## 31  0  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  1  1  0  0  0
## 32  0  1  3  1  0 30 14  6  9  9 30  0  1  1 14  0  6 16 10  4 15  2  3 34
## 33  0  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0  2  0  1  9  4  1  1  3
## 34  1  0  0  1  2  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0
## 35  0  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  1  0  0  1
## 36  0  0  1  0  0  0  2  0  0  1  1  0  0  1  0  2  2  3  0  1  4  1  0  0
## 37  2  0  2  5  0  1  5  0  2  1  4  2  3  2  6 18  0  2  2  7  5  3  0  2
## 38  0  2  0  1  0  0  1  0  1  1  0  0  0  0  0  2  0  5  0  0  3  1  1  1
## 39  0  0  0  1  1  1  1  0  0  0  0  1  0  0  0  0  0  1  3  1  0  0  0  0
## 40  1  3  0  0  1  3  1  0  0  2  4  1  3  4  5  1  2  4  3  1  7  1  1  2
##    47 48 49 50  51 52  53 54 55  56  57 58  59 60  61  62  63  64  65  66
## 1   0  0  0  0   0  0   0  0  0   0   0  0   0  0   0   0   0   0   0   0
## 2   0  0  0  0   0  0   0  0  0   0   0  0   0  0   0   0   0   0   0   0
## 3   2  1  2  1   0  0   0  0  1   1   0  1   0  1   1   0   1   1   0   0
## 4   0  0  0  1   0 13   0  4 11  16  10  3   3  9   4   2   3  10   0  17
## 5   2  0  0  0   0  0   0  0  1   0   2  0   0  2   0   3   3   1   0   0
## 6  60 51 42 44  95 82  71 92 71 104 120 65  76 92 199 249  99 241 197 167
## 7  30 29 18 17  14 18  27 28 23  14   3 40  32 14  32  41  28  56  22  18
## 8   0  0  0  0   0  0   0  0  0   0   0  0   0  0   0   0   0   0   0   0
## 9  28 17  8 47  33 38  38 51 33  58  62 36  34 45  29  59 111  59  71  86
## 10  0  0  0  0   0  0   0  0  0   0   0  0   0  0   0   0   0   0   0   0
## 11  1  1  8 10   4 14   4 13 16  22   8 12   6  8   7   2   7  12  38  45
## 12 89 35 54 58 110 36 129 21 27  17  38 64 119 93  95  92  97 103  87  17
## 13  0  0  0  0   0  2   0  0  0   0   2  0   2  0   1   0   0   0   0   0
## 14  0  0  0  0   0  0   0  0  0   0   0  0   0  0   0   0   0   0   0   0
## 15 10 10  5  6  13 21  13  0  6   2   9  1   5  1   5   2  12  14  17   8
## 16  0  0  0  0   0  0   0  0  0   0   0  0   0  0   0   0   0   0   0   0
## 17  0  0  0  0   0  0   0  2  0   2   1  0   0  1   1   0   0   0   0   0
## 18  1  0  0  1   0  0   2  4  1   1   2  1   1  1   1   2   0   2   0   0
## 19  0  0  0  0   0  0   0  0  0   0   0  0   0  0   0   0   0   0   0   0
## 20  0  0  0  0   0  0   0  0  0   0   0  0   0  0   0   0   0   0   0   0
## 21  2  0  0  0   0  1   1  0  0   3   2 13   2  1   3   4   1   0   0   0
## 22  0  1  0  0   0  0   0  0  2   3   0  0   2  5   0   1   0   0   0   0
## 23  0  0  0  1   0  0   1  0  0   1   0  1   2  2   2   1   0   0   0   0
## 24  0  0  1  0   0  0   0  0  0   0   0  0   0  1   0   1   0   0   0   0
## 25  0  0  0  0   0  0   0  0  1   1   0  0   2  3   0   1   0   0   0   0
## 26  0  3  1  0   0  2   1  0  0   0   0  1   0  2   3   1   0   0   0   0
## 27  0  0  5  0   0  0   0  0  0   1   0  1   1  2   1   2   0   0   0   0
## 28  0  0  3  0   0  0   0  0  0   6   8  3   3  2   3   5   1   0   0   0
## 29  0  0  0  0   0  2   0  0  1   0   1  0   2  2   3   1   0   0   0   0
## 30  0  6  0  0   0  0   0  0  0   0   0  1   0  0   0   1   0   0   0   0
## 31  0  0  0  0   0  0   0  0  0   0   0  0   0  0   0   1   0   0   0   0
## 32  2  6  7  0   5  9  22 10  9   4   8 17   5  9  15   4   1   0   0   0
## 33  0  2  1  1   0  1   3  0  4   2   3  1   3  1   0   2   0   0   0   0
## 34  0  2  1  0   0  1   0  1  0   0   0  0   0  0   1   0   0   0   0   0
## 35  0  0  0  1   0  1   1  0  0   0   0  0   0  1   0   1   1   0   0   0
## 36  2  2  3  4   0  1   2  1  0   1  11  8   0  0   3   0   0   0   0   0
## 37  3  1  3  1   6  8  10  9  1   3   9 14   8  5   5   0   0   0   0   0
## 38  1  0  2  0   2  3   1  1  0   0   0  0   0  0   0   0   0   0   0   0
## 39  0  1  1  0   0  0   0  0  0   1   0  0   1  0   0   0   0   0   0   0
## 40  2  2  5 10  13 11  18  3 11   6  17 11   3  0   7   0   0   0   0   0
##     67  68  69  70  71  72  73  74 75 76 77 78 79 80
## 1    0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 2    0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 3    0   0   1   3   0   2   0   1  0  0  0  0  0  0
## 4   16  16   5  22  29  10  12  11  9  4  7 15  0  0
## 5    0   0   0   1   0   0   0   0  0  0  0  0  0  0
## 6  206 232 178 111 101 197 112 178 23 19  0  0  0  0
## 7    8  19  19  29  19   1   0   0  0  0  0  0  0  0
## 8    0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 9  110 100  87  59  11  25  47  86 35 33  0  0  0  0
## 10   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 11  19  25  47  28  13  27   5   2 18 14 48 33 22  1
## 12   5  19  36  89  80  35   4   0  0  0  0  0  0  0
## 13   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 14   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 15   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 16   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 17   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 18   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 19   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 20   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 21   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 22   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 23   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 24   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 25   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 26   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 27   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 28   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 29   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 30   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 31   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 32   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 33   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 34   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 35   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 36   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 37   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 38   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 39   0   0   0   0   0   0   0   0  0  0  0  0  0  0
## 40   0   0   0   0   0   0   0   0  0  0  0  0  0  0
library(reshape2)
library(ggvis)

# Back to long form: one row per (article_id, month offset).
df1_counts <- melt(all_months_counts, "article_id")
# `variable` holds the former column names ("0", "1", ...) as a factor.
# as.integer() on a factor returns the level codes (1, 2, ...), which would
# shift every month by one, so go through as.character() to recover the
# actual month offsets.
df1_counts$variable <- as.integer(as.character(df1_counts$variable))

# Revision counts per month offset (`variable`), one line per article, now
# including the zero-filled months.
ggplot(
  data = df1_counts,
  mapping = aes(x = variable, y = value, colour = article_id)
) +
  geom_line(aes(group = article_id))

# Save the plot that was just displayed (width 10, height 6).
ggsave(
  filename = 'img-other/articles-months-since-first-revision-melted-subsample1.png',
  plot = last_plot(),
  width = 10, height = 6
)

Okay. So since the cut-off date was 2003-01, then the maximum amount of time for revisions is 5 years. Thus we should cut off all the data after 60 months since first revision. Then convert the values to percentages of all revisions.

# Restrict to the first five years (month offsets 0 through 60) and convert
# each article's monthly counts to shares of its five-year total.
# Columns are selected by name rather than by position, so a month offset
# that happens to be absent from the data cannot shift the window.
month_cols <- intersect(as.character(0:60), names(all_months_counts))
data_5yrs <- all_months_counts[, c("article_id", month_cols)]
data_5yrs_norm <- data_5yrs
data_5yrs_norm$totalrevisions <- rowSums(data_5yrs[, month_cols, drop = FALSE])
data_5yrs_norm[, month_cols] <-
  data_5yrs[, month_cols, drop = FALSE] / data_5yrs_norm$totalrevisions
df1_5yrs <- melt(data_5yrs_norm, c("article_id", "totalrevisions"))
# `variable` is a factor of the former column names; as.integer() alone would
# return level codes (1..61) rather than month offsets (0..60).
df1_5yrs$variable <- as.integer(as.character(df1_5yrs$variable))

# Share of five-year revisions per month offset, one line per article,
# coloured by the article's total number of revisions.
ggplot(data = df1_5yrs, mapping = aes(x = variable, y = value)) +
  geom_line(aes(group = article_id, color = totalrevisions), size = 0.4) +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent of Article Revisions") +
  xlab("Months since First Revision")

# Save the plot that was just displayed (width 10, height 6).
ggsave(
  filename = 'img-other/articles-normalized-by-total-5years-subsample1.png',
  plot = last_plot(),
  width = 10, height = 6
)

I want to discretize the number of total revisions

# Bin articles by their total number of revisions. cut() intervals are
# right-closed: (0,2], (2,10], (10,25], (25,100], (100,500], (500,Inf).
# The last break is Inf so that no total falls outside the bins and becomes
# NA, and the labels state the integer range each interval really covers.
data_5yrs_norm$bins <- cut(
  data_5yrs_norm$totalrevisions,
  breaks = c(0, 2, 10, 25, 100, 500, Inf),
  labels = c("1-2", "3-10", "11-25", "26-100", "101-500", "501+")
)

# Long form again, this time carrying the revision-count bin of each article.
df1_5yrs <- melt(data_5yrs_norm, c("article_id", "totalrevisions", "bins"))
# `variable` is a factor of the former column names; convert via
# as.character() so the values are month offsets, not factor level codes.
df1_5yrs$variable <- as.integer(as.character(df1_5yrs$variable))

# One facet per bin. alpha is a fixed setting, so it sits outside aes();
# inside aes() it is treated as a mapped variable and passed through an
# alpha scale instead of being applied as 0.5.
ggplot(data = df1_5yrs, aes(x = variable, y = value)) +
  geom_line(aes(group = article_id, color = bins), alpha = 0.5, size = 0.5) +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent of Article Revisions") +
  xlab("Months since First Revision") +
  scale_color_hue(direction = -1) +
  facet_grid(~bins) +
  theme(legend.position = "none")

ggsave('img-other/articles-normalized-by-total-5years-facet-subsample1.png',
       plot = last_plot(),
       width = 10,
       height = 6)
# All articles in one panel, coloured by revision-count bin.
# alpha is a fixed setting, so it sits outside aes(); inside aes() it is
# treated as a mapped variable, which adds a spurious "alpha" legend and
# does not apply the literal value 0.4.
ggplot(data = df1_5yrs, aes(x = variable, y = value)) +
  geom_line(aes(group = article_id, color = bins), alpha = 0.4, size = 0.5) +
  scale_y_continuous(labels = scales::percent) +
  ylab("Percent of Article Revisions") +
  xlab("Months since First Revision") +
  scale_color_hue(direction = -1) +
  labs(color = "Total Article Revisions")

ggsave('img-other/articles-normalized-by-total-5years-faceted-subsample1.png',
       plot = last_plot(),
       width = 10,
       height = 6)
# Draw and save one plot per revision-count bin, using the same colour each
# bin gets from scale_color_hue(direction = -1) in the combined plots.
factorlevels <- levels(df1_5yrs$bins)
colorsused <- scales::hue_pal(direction = -1)(length(factorlevels))

for (counter in seq_along(factorlevels)) {
  f <- factorlevels[counter]
  # %in% never returns NA, so rows with a missing bin are excluded instead of
  # turning into all-NA rows as they would with `==`.
  bin_rows <- df1_5yrs[df1_5yrs$bins %in% f, ]
  # A bin without articles has nothing to draw; skip it.
  if (nrow(bin_rows) == 0) {
    next
  }

  bin_plot <- ggplot(data = bin_rows, aes(x = variable, y = value)) +
    geom_line(aes(group = article_id), color = colorsused[counter], size = 0.6, alpha = 0.5) +
    scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
    ylab("Percent of Article Revisions") +
    xlab("Months since First Revision") +
    facet_grid(~bins) +
    theme(legend.position = "none")
  print(bin_plot)

  # Save this exact plot object rather than relying on last_plot().
  ggsave(paste0("img-other/articles-normalized-by-total-5years-facet-", f, "-subsample1.png"),
         plot = bin_plot,
         width = 10,
         height = 6)
}

What’s happening with articles that have over 500 revisions

# Draw and save one plot per revision-count bin under the "over500revis"
# file-name prefix.
# NOTE(review): the surrounding text asks about articles with over 500
# revisions, but this loop covers every bin - confirm whether it should be
# restricted to the largest bin.
factorlevels <- levels(df1_5yrs$bins)
colorsused <- scales::hue_pal(direction = -1)(length(factorlevels))

for (counter in seq_along(factorlevels)) {
  f <- factorlevels[counter]
  # %in% never returns NA, so rows with a missing bin are excluded instead of
  # turning into all-NA rows as they would with `==`.
  bin_rows <- df1_5yrs[df1_5yrs$bins %in% f, ]
  # A bin without articles has nothing to draw; skip it.
  if (nrow(bin_rows) == 0) {
    next
  }

  bin_plot <- ggplot(data = bin_rows, aes(x = variable, y = value)) +
    geom_line(aes(group = article_id), color = colorsused[counter], size = 0.6, alpha = 0.5) +
    scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
    ylab("Percent of Article Revisions") +
    xlab("Months since First Revision") +
    facet_grid(~bins) +
    theme(legend.position = "none")
  print(bin_plot)

  # Save this exact plot object rather than relying on last_plot().
  ggsave(paste0("img-other/articles-normalized-by-total-5years-facet-over500revis-", f, "-subsample1.png"),
         plot = bin_plot,
         width = 10,
         height = 6)
}